{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "SPAM_PATH = os.path.join('datasets','spam')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "HAM_DIR = os.path.join(SPAM_PATH,'easy_ham')\n",
    "SPAM_DIR = os.path.join(SPAM_PATH,'spam')\n",
    "ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name)>20]\n",
    "spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name)>20]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 可以使用python 中的 email  来解析这些邮件\n",
    "import email\n",
    "import email.policy\n",
    "\n",
    "def load_email(is_spam,filename,spam_path=SPAM_PATH):\n",
    "    directory = \"spam\" if is_spam else \"easy_ham\"\n",
    "    with open (os.path.join(spam_path,directory,filename),'rb') as f:\n",
    "        return email.parser.BytesParser(policy=email.policy.default).parse(f)   # 读取邮件的语句"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1) Fight The Risk of Cancer!\n",
      "http://www.adclick.ws/p.cfm?o=315&s=pk007\n",
      "\n",
      "2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\n",
      "http://www.adclick.ws/p.cfm?o=249&s=pk007\n",
      "\n",
      "3) Get the Child Support You Deserve - Free Legal Advice\n",
      "http://www.adclick.ws/p.cfm?o=245&s=pk002\n",
      "\n",
      "4) Join the Web's Fastest Growing Singles Community\n",
      "http://www.adclick.ws/p.cfm?o=259&s=pk007\n",
      "\n",
      "5) Start Your Private Photo Album Online!\n",
      "http://www.adclick.ws/p.cfm?o=283&s=pk007\n",
      "\n",
      "Have a Wonderful Day,\n",
      "Offer Manager\n",
      "PrizeMama\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "If you wish to leave this list please use the link below.\n",
      "http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258\n",
      "\n",
      "\n",
      "-- \n",
      "Irish Linux Users' Group: ilug@linux.ie\n",
      "http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\n",
      "List maintainer: listmaster@linux.ie\n"
     ]
    }
   ],
   "source": [
    "ham_emails = [load_email(is_spam= False,filename= name ) for name in ham_filenames]\n",
    "spam_emails = [load_email(is_spam= True,filename= name ) for name in spam_filenames]\n",
    "print(spam_emails[1].get_content().strip()) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Martin A posted:\n",
      "Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n",
      " limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n",
      " Mount Athos monastic community, was ideal for the patriotic sculpture. \n",
      " \n",
      " As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n",
      " museum, a restored amphitheatre and car park for admiring crowds are\n",
      "planned\n",
      "---------------------\n",
      "So is this mountain limestone or granite?\n",
      "If it's limestone, it'll weather pretty fast.\n",
      "\n",
      "------------------------ Yahoo! Groups Sponsor ---------------------~-->\n",
      "4 DVDs Free +s&p Join Now\n",
      "http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n",
      "---------------------------------------------------------------------~->\n",
      "\n",
      "To unsubscribe from this group, send an email to:\n",
      "forteana-unsubscribe@egroups.com\n",
      "\n",
      " \n",
      "\n",
      "Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/\n"
     ]
    }
   ],
   "source": [
    "print(ham_emails[1].get_content().strip()) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 电子邮件实际上有很多部分，带有图像和附件（它们可以有自己的附件）。查看邮件的各种类型的结构：\n",
    "def get_email_structure(email):\n",
    "    if isinstance(email, str):\n",
    "        return email\n",
    "    payload = email.get_payload()\n",
    "    if isinstance(payload, list):\n",
    "        return \"multipart({})\".format(\", \".join([\n",
    "            get_email_structure(sub_email)\n",
    "            for sub_email in payload\n",
    "        ]))\n",
    "    else:\n",
    "        return email.get_content_type()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Counter({1: 2, 3: 2, 2: 2, 5: 1, 4: 1, 6: 1})\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "a = [1,1,5,3,2,4,6,2,3]\n",
    "b = Counter(a)\n",
    "print(b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "def structures_counter(emails):\n",
    "    structures = Counter()\n",
    "    for email in emails:\n",
    "        structure = get_email_structure(email)\n",
    "        structures[structure] += 1\n",
    "    return structures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('text/plain', 2408),\n",
       " ('multipart(text/plain, application/pgp-signature)', 66),\n",
       " ('multipart(text/plain, text/html)', 8),\n",
       " ('multipart(text/plain, text/plain)', 4),\n",
       " ('multipart(text/plain)', 3),\n",
       " ('multipart(text/plain, application/octet-stream)', 2),\n",
       " ('multipart(text/plain, text/enriched)', 1),\n",
       " ('multipart(text/plain, application/ms-tnef, text/plain)', 1),\n",
       " ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',\n",
       "  1),\n",
       " ('multipart(text/plain, video/mng)', 1),\n",
       " ('multipart(text/plain, multipart(text/plain))', 1),\n",
       " ('multipart(text/plain, application/x-pkcs7-signature)', 1),\n",
       " ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',\n",
       "  1),\n",
       " ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',\n",
       "  1),\n",
       " ('multipart(text/plain, application/x-java-applet)', 1)]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structures_counter(ham_emails).most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('text/plain', 218),\n",
       " ('text/html', 183),\n",
       " ('multipart(text/plain, text/html)', 45),\n",
       " ('multipart(text/html)', 20),\n",
       " ('multipart(text/plain)', 19),\n",
       " ('multipart(multipart(text/html))', 5),\n",
       " ('multipart(text/plain, image/jpeg)', 3),\n",
       " ('multipart(text/html, application/octet-stream)', 2),\n",
       " ('multipart(text/plain, application/octet-stream)', 1),\n",
       " ('multipart(text/html, text/plain)', 1),\n",
       " ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),\n",
       " ('multipart(multipart(text/plain, text/html), image/gif)', 1),\n",
       " ('multipart/alternative', 1)]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structures_counter(spam_emails).most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Return-Path : <ilug-admin@linux.ie>\n",
      "Delivered-To : zzzz@localhost.spamassassin.taint.org\n",
      "Received : from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id A7FD7454F6\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:27:38 -0400 (EDT)\n",
      "Received : from phobos [127.0.0.1]\tby localhost with IMAP (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:27:38 +0100 (IST)\n",
      "Received : from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MCJiZ06043 for    <zzzz-ilug@jmason.org>; Thu, 22 Aug 2002 13:19:44 +0100\n",
      "Received : from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29323; Thu, 22 Aug 2002 13:18:52 +0100\n",
      "Received : from email.qves.com ([67.104.83.251]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29282 for <ilug@linux.ie>; Thu,    22 Aug 2002 13:18:37 +0100\n",
      "X-Authentication-Warning : lugh.tuatha.org: Host [67.104.83.251] claimed to    be email.qves.com\n",
      "Received : from qvp0091 ([169.254.6.22]) by email.qves.com with Microsoft    SMTPSVC(5.0.2195.2966); Thu, 22 Aug 2002 06:18:18 -0600\n",
      "From : Slim Down <taylor@s3.serveimage.com>\n",
      "To : ilug@linux.ie\n",
      "Date : Thu, 22 Aug 2002 06:18:18 -0600\n",
      "Message-Id : <59e6301c249d5$ffb7ea20$1606fea9@freeyankeedom.com>\n",
      "MIME-Version : 1.0\n",
      "Content-Type : text/plain; charset=\"iso-8859-1\"\n",
      "Content-Transfer-Encoding : 7bit\n",
      "X-Mailer : Microsoft CDO for Windows 2000\n",
      "Thread-Index : AcJJ1f+3FWdz11AmR6uWbmQN5gGxxw==\n",
      "Content-Class : urn:content-classes:message\n",
      "X-Mimeole : Produced By Microsoft MimeOLE V6.00.2462.0000\n",
      "X-Originalarrivaltime : 22 Aug 2002 12:18:18.0699 (UTC) FILETIME=[FFB949B0:01C249D5]\n",
      "Subject : [ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206\n",
      "Sender : ilug-admin@linux.ie\n",
      "Errors-To : ilug-admin@linux.ie\n",
      "X-Mailman-Version : 1.1\n",
      "Precedence : bulk\n",
      "List-Id : Irish Linux Users' Group <ilug.linux.ie>\n",
      "X-Beenthere : ilug@linux.ie\n"
     ]
    }
   ],
   "source": [
    "# 正常邮件更多的是纯文本，而垃圾邮件有相当多的HTML。\n",
    "# 查看邮件头\n",
    "for header, value in spam_emails[1].items():\n",
    "    print(header,\":\",value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 里面可能有很多有用的信息，比如发件人的电子邮件地址（12a1mailbot1@web.de看起来很可疑），\n",
    "# 查看“主题”标题：\n",
    "\n",
    "spam_emails[1][\"Subject\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 拆分训练集和测试集合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-18-c6d32b215bf4>:4: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  X = np.array(ham_emails + spam_emails)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X = np.array(ham_emails + spam_emails)\n",
    "y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from html import unescape\n",
    "\n",
    "def html_to_plain_text(html):\n",
    "    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)\n",
    "    text = re.sub('<a\\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)\n",
    "    text = re.sub('<.*?>', '', text, flags=re.M | re.S)\n",
    "    text = re.sub(r'(\\s*\\n)+', '\\n', text, flags=re.M | re.S)\n",
    "    return unescape(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html><body><center>\n",
      "\n",
      "<table bgcolor=\"663399\" border=\"2\" width=\"999\" cellspacing=\"0\" cellpadding=\"0\">\n",
      "  <tr>\n",
      "    <td colspan=\"3\" width=\"999\"> <hr><font color=\"yellow\"> \n",
      "<center>\n",
      "<font size=\"7\"> \n",
      "<br><center><b>Get 12 FREE VHS or DVDs! </b><br>\n",
      "<table bgcolor=\"white\" border=\"2\" width=\"500\">\n",
      "  <tr>    <td>\n",
      " <font size=\"7\"> <font color=\"003399\"><center>Click <a href=\"http://www.bozomber.com/porno/index.html\"> HERE For Details!</a>\n",
      "<font size=\"5\"><br>\n",
      "</td></tr></table> <br> \n",
      "\n",
      "<table bgcolor=\"#CCFF33\" border=\"2\" width=\"600\">\n",
      "  <tr>    <td><center><center><font size=\"6\"><font color=\"6633CC\"><br>\n",
      "We Only Have HIGH QUALITY <br>Porno Movies to Choose From!<br><br>\n",
      " \n",
      " \"This is a <i>VERY SPECIAL, LIMITED TIME OFFER</i>.\"<br><br> Get up to 12 DVDs absolutely FREE,<br> with<a href=\"http://www.bozomber.com/porno/index.html\"> NO COMMITMENT!</a> \n",
      " <br><br>\n",
      "There's <b>no better deal anywhere</b>.<br>\n",
      "There's <i>no catches</i> and <i>no gimmicks</i>. <br>You only pay for the shipping,<br> and the DVDs  ...\n"
     ]
    }
   ],
   "source": [
    "html_spam_emails = [email for email in X_train[y_train==1]\n",
    "                   if get_email_structure(email)==\"text/html\"] \n",
    "sample_html_spam = html_spam_emails[5]\n",
    "print(sample_html_spam.get_content().strip()[:1000], \"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Get 12 FREE VHS or DVDs!\n",
      "  Click  HYPERLINK  HERE For Details!\n",
      "We Only Have HIGH QUALITY Porno Movies to Choose From!\n",
      " \"This is a VERY SPECIAL, LIMITED TIME OFFER.\" Get up to 12 DVDs absolutely FREE, with HYPERLINK  NO COMMITMENT!\n",
      "There's no better deal anywhere.\n",
      "There's no catches and no gimmicks. You only pay for the shipping, and the DVDs are absolutely free!\n",
      "Take a Peak at our HYPERLINK   Full Catalog!\n",
      " High quality cum filled titles such as:\n",
      " HYPERLINK  500 Oral Cumshots 5\n",
      "Description: 500 Oral Cum Shots! I need hot jiz on my face! Will you cum in my mouth?\n",
      " Dozens of Dirty Hardcore titles such as:\n",
      " HYPERLINK  Amazing Penetrations No. 17\n",
      "Description: 4 full hours of amazing penetrations with some of the most beautiful women in porn!\n",
      " From our \"Sexiest Innocent Blondes\" collections:\n",
      " HYPERLINK  Audition Tapes\n",
      "Description: Our girls go from cute, young and innocent, to screaming sex goddess\n",
      " beggin' to have massive cocks in their tight, wet pussies and asses! ...\n"
     ]
    }
   ],
   "source": [
    "html_spam_emails = [email for email in X_train[y_train==1]\n",
    "                   if get_email_structure(email)==\"text/html\"] \n",
    "sample_html_spam = html_spam_emails[5]\n",
    "print(html_to_plain_text(sample_html_spam.get_content()).strip()[:1000], \"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 编写一个函数，它以电子邮件为输入，并以纯文本形式返回其内容，无论其格式是什么\n",
    "def email_to_text(email):\n",
    "    html = None\n",
    "    for part in email.walk():\n",
    "        ctype = part.get_content_type()\n",
    "        if not ctype in (\"text/plain\", \"text/html\"):\n",
    "            continue\n",
    "        try:\n",
    "            content = part.get_content()\n",
    "        except: # 解决编码问题\n",
    "            content = str(part.get_payload())\n",
    "        if ctype == \"text/plain\":\n",
    "            return content\n",
    "        else:\n",
    "            html = content\n",
    "    if html:\n",
    "        return html_to_plain_text(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Get 12 FREE VHS or DVDs!\n",
      "  Click  HYPERLINK  HERE For Details!\n",
      "We Only Have HIGH QUALITY Porno Movi ...\n"
     ]
    }
   ],
   "source": [
    "print(email_to_text(sample_html_spam)[:100],\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computations => comput\n",
      "Computation => comput\n",
      "Computing => comput\n",
      "Computed => comput\n",
      "Compute => comput\n",
      "Compulsive => compuls\n"
     ]
    }
   ],
   "source": [
    "# 装自然语言工具包（[nltk]（http://www.nltk.org/）\n",
    "# pip install nltk\n",
    "\n",
    "# 用“url”替换url的方法 \n",
    "# pip install urlextract\n",
    "import nltk\n",
    "from urlextract import URLExtract\n",
    "\n",
    "try:\n",
    "    import nltk\n",
    "\n",
    "    stemmer = nltk.PorterStemmer()\n",
    "    for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\", \"Compulsive\"):\n",
    "        print(word, \"=>\", stemmer.stem(word))\n",
    "except ImportError:\n",
    "    print(\"Error: stemming requires the NLTK module.\")\n",
    "    stemmer = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将所有处理整合到一个转换器中，我们将使用它将电子邮件转换为文字计数器。注意，我们使用python的'split（）'方法将句子拆分为单词，该方法使用空格作为单词边界。但例如，汉语和日语脚本通常不在单词之间使用空格在这个练习中没关系，因为数据集（主要）是英文的，中文可以使用结巴分词来进行拆分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n",
    "                 replace_urls=True, replace_numbers=True,  stemming=True):\n",
    "        self.strip_headers = strip_headers\n",
    "        self.lower_case = lower_case\n",
    "        self.remove_punctuation = remove_punctuation\n",
    "        self.replace_urls = replace_urls\n",
    "        self.replace_numbers = replace_numbers\n",
    "        self.stemming = stemming\n",
    "    def fit(self, X, y=None):\n",
    "        return self\n",
    "    def transform(self, X, y=None):\n",
    "        X_transformed = []\n",
    "        for email in X:\n",
    "            text = email_to_text(email) or \"\"\n",
    "            if self.lower_case:\n",
    "                text = text.lower()\n",
    "            if self.replace_urls:\n",
    "                extractor = URLExtract()\n",
    "                urls = list(set(extractor.find_urls(text)))\n",
    "                urls.sort(key=lambda url: len(url), reverse=True)\n",
    "                for url in urls:  # 替换url 为 ‘URL’\n",
    "                    text = text.replace(url, \" URL \")\n",
    "            if self.replace_numbers:  # 替换数字\n",
    "                text = re.sub(r'\\d+(?:\\.\\d*(?:[eE]\\d+))?', 'NUMBER', text)\n",
    "            if self.remove_punctuation:  # 删除标点符号\n",
    "                text = re.sub(r'\\W+', ' ', text, flags=re.M)\n",
    "            word_counts = Counter(text.split())\n",
    "            if self.stemming and stemmer is not None:\n",
    "                stemmed_word_counts = Counter()\n",
    "                for word, count in word_counts.items():\n",
    "                    stemmed_word = stemmer.stem(word)\n",
    "                    stemmed_word_counts[stemmed_word] += count\n",
    "                word_counts = stemmed_word_counts\n",
    "            X_transformed.append(word_counts)\n",
    "        return np.array(X_transformed)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),\n",
       "       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'becom': 1, 'most': 1, 'pervert': 1, 'system': 1, 'that': 1, 'ever': 1, 'shone': 1, 'man': 1, 'absurd': 1, 'untruth': 1, 'were': 1, 'perpetr': 1, 'upon': 1, 'a': 1, 'larg': 1, 'band': 1, 'dupe': 1, 'import': 1, 'led': 1, 'paul': 1, 'first': 1, 'great': 1, 'corrupt': 1}),\n",
       "       Counter({'url': 4, 's': 3, 'group': 3, 'to': 3, 'in': 2, 'forteana': 2, 'martin': 2, 'an': 2, 'and': 2, 'we': 2, 'is': 2, 'yahoo': 2, 'unsubscrib': 2, 'y': 1, 'adamson': 1, 'wrote': 1, 'for': 1, 'altern': 1, 'rather': 1, 'more': 1, 'factual': 1, 'base': 1, 'rundown': 1, 'on': 1, 'hamza': 1, 'career': 1, 'includ': 1, 'hi': 1, 'belief': 1, 'that': 1, 'all': 1, 'non': 1, 'muslim': 1, 'yemen': 1, 'should': 1, 'be': 1, 'murder': 1, 'outright': 1, 'know': 1, 'how': 1, 'unbias': 1, 'memri': 1, 'don': 1, 't': 1, 'html': 1, 'rob': 1, 'sponsor': 1, 'number': 1, 'dvd': 1, 'free': 1, 'p': 1, 'join': 1, 'now': 1, 'from': 1, 'thi': 1, 'send': 1, 'email': 1, 'egroup': 1, 'com': 1, 'your': 1, 'use': 1, 'of': 1, 'subject': 1})],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 在一些邮件上 测试 转换器\n",
    "X_few = X_train[:3]\n",
    "X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)\n",
    "X_few_wordcounts\n",
    "\n",
    "# X_few = X_train[:3]\n",
    "# X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)\n",
    "# X_few_wordcounts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 有了单词计数，我们需要把它们转换成向量。为此，我们将构建另一个转换器，其“fit（）”方法将构建词汇表（最常用单词的有序列表），其“transform（）”方法将使用词汇表将单词计数转换为向量--稀疏矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.sparse import csr_matrix\n",
    "\n",
    "class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self, vocabulary_size = 1000):\n",
    "        self.vocabulary_size = vocabulary_size  # 词汇量\n",
    "    def fit(self, X, y = None):\n",
    "        total_count = Counter()\n",
    "        for word_count in X:\n",
    "            for word, count in word_count.items():\n",
    "                total_count[word] += min(count, 10)\n",
    "        most_common = total_count.most_common()[:self.vocabulary_size]\n",
    "        self.most_common_ = most_common\n",
    "        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}\n",
    "        return self\n",
    "    def transform(self, X, y = None):\n",
    "        rows = []\n",
    "        cols = []\n",
    "        data = []\n",
    "        for row, word_count in enumerate(X):\n",
    "            for word, count in word_count.items():\n",
    "                rows.append(row) # 训练集 实例个数\n",
    "                cols.append(self.vocabulary_.get(word, 0)) # 取得单词在词汇表中的索引位置，0代表未出现在词汇表中\n",
    "                data.append(count)\n",
    "        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1)) # 输出稀疏矩阵 +1因为第一列要显示未出现在词汇表中的单词统计数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 11)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)\n",
    "X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)\n",
    "X_few_vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],\n",
       "       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],\n",
       "       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]], dtype=int32)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_few_vectors.toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第三行第一列中的67表示第三封电子邮件包含64个不属于词汇表的单词。旁边的1表示词汇表中'of'单词在此电子邮件中出现一次。旁边的2表示'and'单词出现两次,'the'没有出现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'the': 1,\n",
       " 'of': 2,\n",
       " 'and': 3,\n",
       " 'to': 4,\n",
       " 'url': 5,\n",
       " 'all': 6,\n",
       " 'in': 7,\n",
       " 'christian': 8,\n",
       " 'on': 9,\n",
       " 'by': 10}"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_transformer.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "preprocess_pipeline = Pipeline([\n",
    "    (\"email_to_wordcount\", EmailToWordCounterTransformer()),\n",
    "    (\"wordcount_to_vector\", WordCounterToVectorTransformer()),\n",
    "])\n",
    "\n",
    "X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s\n",
      "[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV]  ................................................................\n",
      "[CV] .................................... , score=0.981, total=   0.1s\n",
      "[CV]  ................................................................\n",
      "[CV] .................................... , score=0.984, total=   0.0s\n",
      "[CV]  ................................................................\n",
      "[CV] .................................... , score=0.991, total=   0.2s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\lenovo\\appdata\\local\\programs\\python\\python38\\lib\\site-packages\\sklearn\\svm\\_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  warnings.warn(\"Liblinear failed to converge, increase \"\n",
      "[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9854166666666666"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42) # 采用逻辑回归分类器\n",
    "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "精度: 96.88%\n",
      "召回: 97.89%\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import precision_score, recall_score\n",
    "\n",
    "X_test_transformed = preprocess_pipeline.transform(X_test)\n",
    "\n",
    "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
    "log_clf.fit(X_train_transformed, y_train)\n",
    "\n",
    "y_pred = log_clf.predict(X_test_transformed)\n",
    "\n",
    "print(\"精度: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n",
    "print(\"召回: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
